# Start from an empty global environment so stale objects from an earlier
# interactive session cannot leak into this run.
rm(list = ls())

# Project helpers. NOTE(review): textmodel_slda() and tfm(), used further
# down, are presumably defined in this file -- confirm.
source("functions.R")

# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, deferring the failure to the first function call.
library(quanteda)
library(topicmodels)
library(ldatuning)

# Dictionary of candidate topic terms read from YAML.
dict <- dictionary(file = "topic_candidate.yml")

# Load the stored tokens object and keep only documents whose `year`
# document variable is 1991 or later.
toks <- readRDS("data/data_tokens_sent.RDS")
toks <- tokens_subset(toks, year >= 1991)

# Compound sequences matching dictionary entries into single tokens, using a
# space as the concatenator.
toks <- tokens_compound(toks, dict, join = FALSE, concatenator = " ")

# Document-feature matrix without the empty-string feature and English
# stopwords, then trimmed with min_termfreq = 10.
mt <- dfm(toks, remove = c("", stopwords("en")))
mt <- dfm_trim(mt, min_termfreq = 10)

# Topic-number tuning: score LDA models for k = 5, 10, ..., 50 with the
# "Griffiths2004" metric and store the resulting table.
# Flip the guard to FALSE to skip this step.
if (TRUE) {
  dtm_tune <- convert(mt, "topicmodels")
  tune <- FindTopicsNumber(
    dtm_tune,
    topics = seq(5, 50, 5),
    metrics = c("Griffiths2004"),
    control = list(seed = 1234),
    mc.cores = 6
  )
  saveRDS(tune, "lda_tuning.RDS")
}

# Fit unsupervised LDA models for a fixed set of topic counts and save each
# one as "lda_k<k>.RDS". Flip the guard to FALSE to skip this step.
if (TRUE) {
  # The conversion does not depend on k, so do it once rather than per model.
  dtm_lda <- convert(mt, "topicmodels")
  for (k in c(5, 6, 10, 15, 20)) {
    # Same seed for every k, so each fit is reproducible on its own.
    lda <- LDA(dtm_lda, control = list(seed = 1234), k = k)
    saveRDS(lda, paste0("lda_k", k, ".RDS"))
  }
}

# Settings shared by every seeded-LDA variant below; both are forwarded to
# tfm() unchanged.
weight <- 500
scheme <- "absolute"

#' Fit one seeded LDA variant and return predictions for the coded documents
#'
#' Fits textmodel_slda() on `mt` with a seed matrix built by tfm(), saves the
#' fitted model to `file`, and returns the document variables of the coded
#' documents with an added `topic` column holding the predicted topic label.
#' NOTE(review): textmodel_slda() and tfm() are not defined in this script;
#' presumably they come from functions.R -- confirm.
#'
#' @param mt Document-feature matrix to fit the model on.
#' @param seeds Dictionary of seed words passed to tfm().
#' @param level Passed as tfm()'s third positional argument (presumably the
#'   dictionary nesting level to use -- confirm against tfm()).
#' @param file Path the fitted model is written to with saveRDS().
#' @param weight,scheme Forwarded to tfm().
#' @return A data.frame: docvars of the documents that have a non-missing
#'   `topic_human` and at least one token, plus the `topic` column.
fit_slda <- function(mt, seeds, level, file,
                     weight = 500, scheme = "absolute") {
  seed_matrix <- tfm(mt, seeds, level, weight = weight,
                     scheme = scheme, residual = FALSE)
  slda <- textmodel_slda(mt, seed_matrix)
  saveRDS(slda, file)

  # Keep only documents with a value in `topic_human` that still have tokens.
  mt_coded <- dfm_subset(mt, !is.na(topic_human) & ntoken(mt) > 0)
  dat <- docvars(mt_coded)

  # Align predictions to the coded documents by document name. docnames() is
  # used instead of row.names(dat) because docvars() is not guaranteed to
  # carry document names as row names (NOTE(review): newer quanteda releases
  # appear to return plain 1..n row names, which would make every match NA).
  topic <- topics(slda)
  topic_id <- topic[match(docnames(mt_coded), names(topic))]

  # Translate the topic index into its label, taken from the row names of the
  # model's seedwords slot.
  dat$topic <- rownames(slda@seedwords)[topic_id]
  dat
}

# One entry per seed-word variant, fitted in the same order as before.
pred <- list()
pred[["knowledge"]] <- fit_slda(
  mt, dict[["knowledge"]], 1, "slda_knowledge.RDS",
  weight = weight, scheme = scheme
)
pred[["frequency"]] <- fit_slda(
  mt, dict[["frequency"]], 1, "slda_frequency.RDS",
  weight = weight, scheme = scheme
)
# The full dictionary is used at level 2 rather than 1.
pred[["all"]] <- fit_slda(
  mt, dict, 2, "slda_all.RDS",
  weight = weight, scheme = scheme
)
pred[["low"]] <- fit_slda(
  mt, dictionary(file = "topic_low-entropy.yml"), 1, "slda_low.RDS",
  weight = weight, scheme = scheme
)
pred[["high"]] <- fit_slda(
  mt, dictionary(file = "topic_high-entropy.yml"), 1, "slda_high.RDS",
  weight = weight, scheme = scheme
)

saveRDS(pred, "data_prediction_slda.RDS")
